#==============================================================================#
# Geolocation of urban villages from Buringh (2020) list of cities
#==============================================================================#

rm(list = ls())
gc()

# Load libraries
library(tidyverse)
library(data.table)
library(tidytable)
library(progress)
library(readxl)
library(geonames)
#------------------------------------------------------------------------------#

options(geonamesUsername = "your user name in geonames.com")

dirs<-list(
  cities = "~/Replication package"
)

# Load and filter data
cities <- read_xlsx(paste0(dirs$cities, "./1_raw_data/1_2_geni/auxiliary_files/buringh.xlsx")) %>%
  as.data.table() %>%
  tidytable() %>%
  filter(country == "France") %>%
  filter(year == 1700)

#------------------------------------------------------------------------------#
# Prepare list of unique cities to geocode
locs <- cities %>%
  filter(country == "France") %>%
  select(city) %>%
  distinct() %>%
  pull(city)

# Initialize empty table for results
out <- data.table()

# Loop to geocode
for (i in seq_along(locs)) {
  city_name <- locs[i]
  print(city_name)
  
  ref <- tryCatch({
    GNsearch(name_equals = city_name, country = "FR", featureClass = "P", continentCode = "EU")
  }, error = function(e) {
    warning(paste("Error for:", city_name))
    return(NULL)
  })
  
  if (!is.null(ref) && nrow(ref) > 0) {
    ref$loc <- city_name  # ✅ Ensure loc is added here
    out <- bind_rows(out, as.data.table(ref))
  } else {
    # In case no results, still preserve the city name
    out <- bind_rows(out, data.table(loc = city_name))
  }
}

# Check structure
print(names(out))  # Check if "loc" is present

# Make sure loc exists before grouping
if (!"loc" %in% names(out)) {
  stop("Column 'loc' is missing from 'out'. Check geocoding loop.")
}

# Remove duplicates: keep only first row per city (loc)
out <- out %>%
  group_by(loc) %>%
  mutate(n = row_number()) %>%
  filter(n == 1) %>%
  ungroup() %>%
  select(-n)

# Add flag for checking join
locs_df <- cities %>%
  select(loc = city) %>%
  distinct() %>%
  mutate(flag = 1)

# Join and validate
out <- left_join(out, locs_df, by = "loc")
stopifnot(nrow(out) == nrow(filter(out, flag == 1)))
out <- select(out, -flag)

# Add city metadata from original dataset
out <- mutate(out, city = loc) %>%
  left_join(
    select(cities, city, city_lat = `latitude in degrees`, city_lon = `longitude in degrees`,
            pop = `inhabitants in 000-s`)
  )

setnames(out, old = c("city", "city_lat", "city_lon"),
         new = c("geonames_place", "geonames.lat", "geonames.lng"))

# Export georeferenced cities
fwrite(out, paste0(dirs$cities, "./1_raw_data/1_2_geni/auxiliary_files/fr-cities-manual-fix.csv"))
